import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import numpy as np
from scipy import stats
import os
import re
import math
%matplotlib inline
#plt.style.use('ggplot')
plt.style.use('seaborn-darkgrid')
#sns.set_context("paper", font_scale=4.0)
sns.set_context("paper", font_scale=2.0)
#sns.set_palette("cubehelix", 8)
sns.set_palette("Paired")
# http://www.r-graph-gallery.com/38-rcolorbrewers-palettes/
%cd /projects/kesara/collaborators/Peter/AS-SD
%pwd
# DESeq: this script reads DESeq ouput files for all 45 tissues (male vs female) and creates table and dataframe
# of p-value and log2FC. Further, using p-value matrix, this script also generates combined p-value and zscore of genes.
def dframe(line):
tissue = os.path.basename(line.strip())
tissue = re.sub(r"DE_result_|.txt","", tissue)
print tissue
df = pd.read_table(line.strip())
df.reset_index(level=0, inplace=True)
# p-values
p = df[['index','P.Value']]
p = p.rename(columns={'index':'Gene', 'logFC':tissue})
#print p.head()
# filter sig. DE genes based on p-value and logFC
#de = df[(df['adj.P.Val']<=0.1) & (abs(df['logFC']>=0.575))]
de = df[df['adj.P.Val']<=0.1]
de = de[['index','logFC']]
de = de.rename(columns={'index':'Gene', 'logFC':tissue})
return de,p
# read DESeq ouptut files
f = open ('downloads/guy/deseq/deseq_list.txt','r')
line = f.readline()
fc,p = dframe(line)
for line in f:
df_fc, df_p = dframe(line.strip())
fc = pd.merge(fc, df_fc, how='outer', on='Gene')
p = pd.merge(p, df_p, how='outer', on='Gene')
f.close()
#fc = fc.dropna(how='all')
#p = p.dropna(how='all')
#p = p.fillna(1) # for tissues where genes were not filtered out due to few reads
p = p.set_index('Gene')
### write FC
#fc['NS'] = fc.isnull().sum(axis=1)
#fc = fc.sort_values(by=['NS'], ascending=True)
fc.to_csv('results/deseq/DE_Sig.FDR.1.tsv',sep='\t',index=False)
# combined pval (order statistics)
p['ret'] = p.apply(np.asarray,dtype=float).apply(lambda x: scipy.stats.combine_pvalues(x, method='fisher', weights=None), axis=1)
p['zscore'] = p['ret'].apply(lambda x: x[0])
p['comb_p'] = p['ret'].apply(lambda x: x[1])
p = p.drop(['ret'], axis=1)
p = p.dropna(how='any')
#p = p.sort_values(by=['comb_p'], ascending=True)
p = p.sort_values(by=['zscore'], ascending=False)
p.to_csv('results/deseq/comb_pvalue.tsv',sep='\t')
p.loc[p['comb_p']<=0.1, ['zscore']].to_csv('results/deseq/comb_pvalue_10_zscore.rnk',sep='\t')
#print 'top 10 genes..'
#print p[['zscore', 'comb_p']].head(50)
#p['zscore'].to_csv('results/deseq/ranked_gene_zscore.rnk',sep='\t')
#print "Search"
#print p.loc['FAM86C1']
### RBPs
rbp = pd.read_table('ref/RBP_short_list_Human_Mouse.txt')
rbp_fc = fc[fc.Gene.isin(rbp['HumanGeneName'].tolist())]
rbp_fc.to_csv('results/deseq/DE_Sig.RBP.FDR.1.tsv',sep='\t',index=False)
p.shape
p.tail()
p.head(20)
p.loc[p['comb_p']<=0.1, ['comb_p']].shape # # genes with comb_p <= 0.1 used for GSEA
# http://software.broadinstitute.org/gsea/msigdb/collections.jsp#H
# top genes are sorted based on zscore (as calcuated by Fisher's method)
#p[['zscore', 'comb_p']].head()
gene_list = p.head(20).index.tolist()
top = fc[fc.Gene.isin(gene_list)].set_index('Gene')
top
plt.figure(figsize=(20,10))
sns.heatmap(top, cmap="Spectral", vmin=-1, vmax=1)
# top 20 genes; but transform -0.3 to 0.3 logFC as NaN
top_2 = top.apply(lambda x: [i if abs(i) > 0.3 else np.nan for i in x])
plt.figure(figsize=(20,10))
sns.heatmap(top_2, cmap="Spectral", vmin=-1, vmax=1)
p['comb_p'].plot(kind='hist')
rbp = pd.read_table('results/deseq/DE_Sig.RBP.FDR.1.tsv') # FDR 10%
rbp = rbp.set_index('Gene')
rbp = rbp.iloc[:,0:-1]
rbp.head()
rbp.shape
#from matplotlib.colors import LogNorm
plt.subplots(figsize=(25,25))
#sns.heatmap(rbp, cmap="YlGnBu")
#sns.heatmap(rbp, cmap="PiYG")
sns.heatmap(rbp, cmap="Spectral", vmin=-0.5, vmax=0.5)
# suggestions from Guru: set logFC from +0.3 to - 0.3 as Nan, and scale down logFC > 0.5 to 0.5 for the better heatmap visualization
rbp_2 = rbp.apply(lambda x: [0.5 if abs(i) > 0.5 else i for i in x])
rbp_2 = rbp.apply(lambda x: [i if abs(i) > 0.3 else np.NaN for i in x])
rbp_2.head()
plt.subplots(figsize=(25,25))
sns.heatmap(rbp_2, cmap="Spectral", vmin=-0.5, vmax=0.5)
#rbp.loc['ZRSR2']
rbp.loc['QKI']
tpm = pd.read_csv('/projects/kesara/collaborators/Peter/AS-SD/results/TPM/ZRSR2_tpm.csv')
plt.figure(figsize=(25,5))
ax = sns.boxplot(x="Tissue", y="TPM", hue="Sex", data=tpm)
plt.xticks(rotation=90)
#ax.set_yscale('log')
ax.set_title('ZRSR2')
tpm = pd.read_csv('/projects/kesara/collaborators/Peter/AS-SD/results/TPM/ZFX_tpm.csv')
plt.figure(figsize=(25,5))
ax = sns.boxplot(x="Tissue", y="TPM", hue="Sex", data=tpm)
plt.xticks(rotation=90)
#ax.set_yscale('log')
ax.set_title('ZFX')
This gene is located on the X chromosome and is the corresponding locus to a Y-linked gene which encodes a tetratricopeptide repeat (TPR) protein. The encoded protein of this gene contains a JmjC-domain and catalyzes the demethylation of tri/dimethylated histone H3. Multiple alternatively spliced transcript variants have been found for this gene. [provided by RefSeq, Apr 2014]
tpm = pd.read_csv('/projects/kesara/collaborators/Peter/AS-SD/results/TPM/KDM6A_tpm.csv')
plt.figure(figsize=(25,5))
ax = sns.boxplot(x="Tissue", y="TPM", hue="Sex", data=tpm)
plt.xticks(rotation=90)
ax.set_yscale('log')
ax.set_title('KDM6A')
Upregulatin of this gene in female in context of sex dimorphism https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5769539/
tpm = pd.read_csv('/projects/kesara/collaborators/Peter/AS-SD/results/TPM/RP13-36G14.4_tpm.csv')
plt.figure(figsize=(25,5))
ax = sns.boxplot(x="Tissue", y="TPM", hue="Sex", data=tpm)
plt.xticks(rotation=90)
ax.set_yscale('log')
ax.set_title('RP13-36G14.4')
plotted are the TPM (log) of RBPs extracted from GTEx repository
tpm = pd.read_csv('/projects/kesara/collaborators/Peter/AS-SD/results/TPM/SNRNP70_tpm.csv')
plt.figure(figsize=(25,5))
ax = sns.boxplot(x="Tissue", y="TPM", hue="Sex", data=tpm)
plt.xticks(rotation=90)
ax.set_yscale('log')
ax.set_title('SNRNP70')
tpm = pd.read_csv('/projects/kesara/collaborators/Peter/AS-SD/results/TPM/HNRNPA1_tpm.csv')
plt.figure(figsize=(25,5))
ax = sns.boxplot(x="Tissue", y="TPM", hue="Sex", data=tpm)
plt.xticks(rotation=90)
ax.set_yscale('log')
ax.set_title('HNRNPA1')
tpm = pd.read_csv('/projects/kesara/collaborators/Peter/AS-SD/results/TPM/YBX1_tpm.csv')
plt.figure(figsize=(25,5))
ax = sns.boxplot(x="Tissue", y="TPM", hue="Sex", data=tpm)
plt.xticks(rotation=90)
ax.set_yscale('log')
ax.set_title('YBX1')
tpm = pd.read_csv('/projects/kesara/collaborators/Peter/AS-SD/results/TPM/FUS_tpm.csv')
f,axes = plt.subplots(figsize=(25,5))
ax = sns.boxplot(x="Tissue", y="TPM", hue="Sex", data=tpm)
plt.xticks(rotation=90)
ax.set_yscale('log')
ax.set_title('FUS')
tpm = pd.read_csv('/projects/kesara/collaborators/Peter/AS-SD/results/TPM/HNRNPA2B1_tpm.csv')
f,axes = plt.subplots(figsize=(25,5))
ax = sns.boxplot(x="Tissue", y="TPM", hue="Sex", data=tpm)
plt.xticks(rotation=90)
ax.set_yscale('log')
ax.set_title('HNRNPA2B1')
tpm = pd.read_csv('/projects/kesara/collaborators/Peter/AS-SD/results/TPM/SF1_tpm.csv')
f,axes = plt.subplots(figsize=(25,5))
ax = sns.boxplot(x="Tissue", y="TPM", hue="Sex", data=tpm)
plt.xticks(rotation=90)
ax.set_yscale('log')
ax.set_title('SF1')
tpm = pd.read_csv('/projects/kesara/collaborators/Peter/AS-SD/results/TPM/HNRNPK_tpm.csv')
f,axes = plt.subplots(figsize=(25,5))
ax = sns.boxplot(x="Tissue", y="TPM", hue="Sex", data=tpm)
plt.xticks(rotation=90)
ax.set_yscale('log')
ax.set_title('HNRNPK')
tpm = pd.read_csv('/projects/kesara/collaborators/Peter/AS-SD/results/TPM/PABPN1_tpm.csv')
f,axes = plt.subplots(figsize=(25,5))
ax = sns.boxplot(x="Tissue", y="TPM", hue="Sex", data=tpm)
plt.xticks(rotation=90)
ax.set_yscale('log')
ax.set_title('PABPN1')
tpm = pd.read_csv('/projects/kesara/collaborators/Peter/AS-SD/results/TPM/HNRNPH1_tpm.csv')
f,axes = plt.subplots(figsize=(25,5))
ax = sns.boxplot(x="Tissue", y="TPM", hue="Sex", data=tpm)
plt.xticks(rotation=90)
ax.set_yscale('log')
ax.set_title('HNRNPH1')
tpm = pd.read_csv('/projects/kesara/collaborators/Peter/AS-SD/results/TPM/HNRNPL_tpm.csv')
f,axes = plt.subplots(figsize=(25,5))
ax = sns.boxplot(x="Tissue", y="TPM", hue="Sex", data=tpm)
plt.xticks(rotation=90)
ax.set_yscale('log')
ax.set_title('HNRNPL')
tpm = pd.read_csv('/projects/kesara/collaborators/Peter/AS-SD/results/TPM/XIST_tpm.csv')
plt.figure(figsize=(25,5))
ax = sns.boxplot(x="Tissue", y="TPM", hue="Sex", data=tpm)
plt.xticks(rotation=90)
ax.set_yscale('log')
ax.set_title('XIST')
tpm = pd.read_csv('/projects/kesara/collaborators/Peter/AS-SD/results/TPM/XIST_tpm.csv')
plt.figure(figsize=(25,5))
ax = sns.boxplot(x="Tissue", y="TPM", hue="Sex", data=tpm)
plt.xticks(rotation=90)
#ax.set_yscale('log')
ax.set_title('XIST')